This past June, I did a sentiment analysis of the Kuczaj Corpus ADD LINK HERE from the CHILDES database for my final project in the Data Visualization class taught by Alison Presmanes Hill, Steven Bedrick, and Jackie Wirz. During my presentation some of the questions that came up were: Does the total number of words per transcript vary a lot? How much is this effecting the sentiment analysis? What would happen to the plot if I normalized for transcript length?
The length of the transcripts does vary a great deal, both before and after processing and filtering against the nrc sentiment lexicon. I had actually noticed this artifact in the dataset when working on the project and had plans to address it. I ended up not having enough time to explore different normalization techniques and included a limitations section discussing how this could affect the visualizations I created.
Now, two months later, I am revisiting these questions and am going to find out what will happen if I normalize for transcript length!
Here are the packages that I will be using:
# Loading packages
library(tidyverse)
library(tidytext)
library(forcats)
library(skimr)
library(egg)
I saved the version of the dataset that I used to create the ridgeline density plots as a csv file so I could pick up where I left off.
# reading in data
kuczaj <- read_csv("data/kuczaj_nrc.csv")
Here’s a glimpse of what kuczaj looks like:
glimpse(kuczaj)
## Observations: 22,893
## Variables: 4
## $ age_months <dbl> 29.00060, 29.00060, 29.00060, 29.00060, 29.00060, 2...
## $ age_years <dbl> 2.416716, 2.416716, 2.416716, 2.416716, 2.416716, 2...
## $ word <chr> "hurt", "hurt", "hurt", "hurt", "break", "cry", "cr...
## $ sentiment <chr> "anger", "fear", "negative", "sadness", "surprise",...
Since I only worked with the “trust”, “joy”, “anticipation”, “sadness”, “fear”, and “anger” sentiments last time, I am going to filter out all other sentiments from the dataframe. I’m also going to coerce sentiment to a factor and will order its levels with the positively associated ones before the negatively associated ones to make plotting easier later on.
# sentiments to keep
sentiment_levels <- c("trust", "joy", "anticipation",
"sadness", "fear", "anger")
# making sentiment a factor
kuczaj <- kuczaj %>%
select(-age_years, -word) %>% # removing unneeded columns
filter(sentiment %in% sentiment_levels) %>%
mutate(sentiment = factor(sentiment, levels = sentiment_levels))
Since there are only partial observations for the first and last month, removing them:
kuczaj <- kuczaj %>%
filter(age_months >= 29 & age_months <= 60)
kuczaj2 <- kuczaj %>%
add_count(age_months, sentiment) %>%
rename(n_sentiment = n) %>%
distinct(age_months, sentiment, .keep_all = TRUE)
# Adding in count for total tokens per transcript that were kept
kuczaj2 <- kuczaj2 %>%
group_by(age_months) %>%
mutate(n_tokens = sum(n_sentiment)) %>%
ungroup()
skim(kuczaj2)
## Skim summary statistics
## n obs: 1192
## n variables: 4
##
## Variable type: factor
## variable missing complete n n_unique
## sentiment 0 1192 1192 6
## top_counts ordered
## joy: 203, ant: 203, tru: 202, sad: 199 FALSE
##
## Variable type: integer
## variable missing complete n mean sd p0 p25 p50 p75 p100
## n_sentiment 0 1192 1192 10.82 7.98 1 5 9 14 60
## n_tokens 0 1192 1192 64.02 30.23 7 45 58 77 174
## hist
## ▇▆▂▁▁▁▁▁
## ▂▆▇▃▂▁▁▁
##
## Variable type: numeric
## variable missing complete n mean sd p0 p25 p50 p75 p100
## age_months 0 1192 1192 42.72 8.46 29 35.43 42.13 49.17 59.89
## hist
## ▇▇▇▇▇▅▃▅
aux <- kuczaj2 %>%
tidyr::expand(nesting(age_months, n_tokens), sentiment)
kuczaj2 <- aux %>%
left_join(kuczaj2, by = c("age_months", "n_tokens", "sentiment")) %>%
replace_na(list(n_sentiment = 0))
kuczaj2 <- kuczaj2 %>%
mutate(percent = n_sentiment/n_tokens)
ggplot(kuczaj2, aes(age_months, percent, color = sentiment)) +
geom_point(alpha = 0.7) +
facet_wrap(~ sentiment) +
labs(x = "Age (months)", y = "Percent") +
guides(color = FALSE) +
theme_minimal()
ages_binned <- kuczaj %>%
mutate(age_months = floor(age_months))
ages_binned <- ages_binned %>%
add_count(age_months, sentiment) %>%
rename(n_sentiment = n) %>%
distinct(age_months, sentiment, .keep_all = TRUE)
# Adding in count for total tokens per transcript that were kept
ages_binned <- ages_binned %>%
group_by(age_months) %>%
mutate(n_tokens = sum(n_sentiment)) %>%
ungroup()
skim(ages_binned)
## Skim summary statistics
## n obs: 186
## n variables: 4
##
## Variable type: factor
## variable missing complete n n_unique
## sentiment 0 186 186 6
## top_counts ordered
## tru: 31, joy: 31, ant: 31, sad: 31 FALSE
##
## Variable type: integer
## variable missing complete n mean sd p0 p25 p50 p75 p100
## n_sentiment 0 186 186 69.34 41.95 10 38 61 90.75 234
## n_tokens 0 186 186 416.06 192.51 124 271 354 614 862
## hist
## ▇▇▆▂▂▁▁▁
## ▃▅▇▂▂▃▂▁
##
## Variable type: numeric
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## age_months 0 186 186 44 8.97 29 36 44 52 59 ▇▇▇▇▆▇▇▇
aux2 <- ages_binned %>%
tidyr::expand(nesting(age_months, n_tokens), sentiment)
ages_binned2 <- aux2 %>%
left_join(ages_binned, by = c("age_months", "n_tokens", "sentiment")) %>%
replace_na(list(n_sentiment = 0))
ages_binned2 <- ages_binned2 %>%
mutate(percent = n_sentiment/n_tokens)
Adding in a column for positive and negative groups
ages_binned2 <- ages_binned2 %>%
mutate(type = ifelse(sentiment %in% c("trust", "joy", "anticipation"),
"positive", "negative"))
ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
geom_col(alpha = 0.7) +
facet_wrap(~ sentiment) +
labs(x = "Age (months)", y = "Percent", title = "Ages binned") +
guides(fill = FALSE) +
theme_minimal()
ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
geom_col(alpha = 0.7) +
facet_grid(sentiment ~ .)+
labs(x = "Age (months)", y = "Percent", title = "Ages binned") +
guides(fill = FALSE) +
theme_minimal()
positive_plot <- ages_binned2 %>%
filter(type == "positive") %>%
ggplot(aes(age_months, percent, fill = sentiment)) +
geom_col(alpha = 0.7) +
facet_wrap(~sentiment, ncol = 1) +
geom_vline(xintercept = c(36, 48), color = "black") +
geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
# changing appearance
theme_minimal() +
labs(x = "", y = "") +
guides(fill = FALSE) +
scale_fill_manual(values = c("#97B8C7", "#AEC9C3", "#7FCCD3")) +
scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
labels = c("5%", "15%", "25%")) +
scale_x_continuous(breaks = c(29, 36, 48, 59),
labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
theme(strip.text = element_text(size = 13, face = "italic"),
axis.text.x = element_text(face = "italic"),
axis.text.y = element_text(face = "italic"))
negative_plot <- ages_binned2 %>%
filter(type == "negative") %>%
ggplot(aes(age_months, percent, fill = sentiment)) +
geom_col(alpha = 0.7) +
facet_wrap(~sentiment, ncol = 1) +
geom_vline(xintercept = c(36, 48), color = "black") +
geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
# changing apperance
theme_minimal() +
labs(x = "", y = "") +
guides(fill = FALSE) +
scale_fill_manual(values = c("#21132B", "#4F406E", "#6C7399")) +
scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
labels = c("5%", "15%", "25%")) +
scale_x_continuous(breaks = c(29, 36, 48, 59),
labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
theme(strip.text = element_text(size = 15, face = "italic"),
axis.text.x = element_text(face = "italic"),
axis.text.y = element_text(face = "italic"))
ggarrange(positive_plot, negative_plot, ncol = 2, nrow = 1)
ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
geom_col(alpha = 0.7) +
facet_wrap(~ sentiment) +
geom_vline(xintercept = c(36, 48), color = "black") +
geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
theme_minimal() +
labs(x = "", y = "") +
guides(fill = FALSE) +
scale_fill_manual(values = c("#97B8C7", "#AEC9C3", "#7FCCD3",
"#21132B", "#4F406E", "#6C7399")) +
scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
labels = c("5%", "15%", "25%")) +
scale_x_continuous(breaks = c(29, 36, 48, 59),
labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
theme(strip.text = element_text(size = 15, face = "bold.italic"),
axis.text.x = element_text(size = 10, face = "italic"),
axis.text.y = element_text(size = 10, face = "italic"))
ggplot(ages_binned2, aes(age_months, percent, color = sentiment, linetype = sentiment)) +
geom_line(size = 1) +
scale_color_manual(values = c("#21132B", "#4F406E", "#6C7399",
"#97B8C7", "#AEC9C3", "#7FCCD3")) +
theme_minimal()